# encoding: utf-8
'''
Created on 2017年12月18日

@author: DELL
'''
import json

MEMORY = dict()
KNOWLEDGE_BASE = []
KNOWLEDGE_WORD = dict()
KNOWLEDGE_WORD_INNER = dict()
KNOWLEDGE_SENTENCE = dict()


def learn_k_base(sentence):
    """Learn every character of ``sentence`` and index the sentence under it.

    For each character, in order (repeated characters are processed again):

    * if the character is already in ``KNOWLEDGE_BASE``, ``sentence`` is
      appended to ``KNOWLEDGE_SENTENCE[ch]`` unless it is already listed;
    * otherwise the character is appended to ``KNOWLEDGE_BASE`` and
      ``KNOWLEDGE_SENTENCE[ch]`` is created holding only ``sentence``.

    After either branch ``learn_k_word(ch, sentence)`` is called. Progress
    messages are printed to stdout.

    Args:
        sentence: The text to learn; it is iterated character by character.
    """
    for ch in sentence:
        if ch in KNOWLEDGE_BASE:
            print("I know %s" % ch)
            known_sentences = KNOWLEDGE_SENTENCE[ch]
            if sentence in known_sentences:
                print("known sentence %s" % sentence)
            else:
                known_sentences.append(sentence)
        else:
            KNOWLEDGE_BASE.append(ch)
            print("Learn new %s" % ch)
            KNOWLEDGE_SENTENCE[ch] = [sentence]
        learn_k_word(ch, sentence)
    

def learn_k_word(ch, new_sentence):
    """Compare ``new_sentence`` with every other sentence recorded for ``ch``.

    Each sentence in ``KNOWLEDGE_SENTENCE[ch]`` that differs from
    ``new_sentence`` is passed to ``find_word`` together with the index of
    the first occurrence of ``ch`` in each of the two sentences.

    Args:
        ch: The character whose recorded sentences are examined.
        new_sentence: The sentence being compared against the recorded ones.
    """
    others = (known for known in KNOWLEDGE_SENTENCE[ch] if known != new_sentence)
    for known in others:
        find_word(ch, known, known.find(ch), new_sentence, new_sentence.find(ch))
            
            
def find_word(ch, s1, k1, s2, k2):
    """Record the substrings starting at ``ch`` that ``s1`` and ``s2`` share.

    ``s1[k1]`` and ``s2[k2]`` are expected to be occurrences of ``ch``. The
    position in ``s1`` stays fixed at ``k1``; it is compared first with the
    occurrence at ``k2`` in ``s2`` and then with each later occurrence of
    ``ch`` in ``s2`` (the search resumes after the characters just matched).

    Whenever more than one character matches, the common substring is
    recorded under ``ch``:

    * in ``KNOWLEDGE_WORD_INNER`` when both occurrences have a preceding
      character and ``is_inner_word`` accepts the word for both of those
      preceding characters;
    * in ``KNOWLEDGE_WORD`` otherwise.

    Progress messages are printed to stdout. The scan over ``s2`` is a loop
    rather than recursion, so the number of occurrences of ``ch`` in ``s2``
    is not bounded by the interpreter's recursion limit.

    Args:
        ch: The character both candidate words start with.
        s1: First sentence; only the occurrence at ``k1`` is examined.
        k1: Index of ``ch`` in ``s1``; a negative value means "not found".
        s2: Second sentence; scanned from ``k2`` onwards.
        k2: Index of ``ch`` in ``s2``; a negative value means "not found".

    Returns:
        None.
    """
    if k1 < 0 or k2 < 0:
        return
    l1 = len(s1)
    l2 = len(s2)
    while True:
        # Start comparing at the character right after ch.
        count = 1
        while (l1 > k1 + count and l2 > k2 + count
               and s1[k1 + count] == s2[k2 + count]):
            count += 1

        if count > 1:
            word = s1[k1:k1 + count]
            print("---- find :%s" % word)
            # NOTE(review): hard-coded trace for one specific word; looks like
            # leftover debugging. Kept so the printed output is unchanged.
            if word == "子计算机":
                print(k1, s1)
                print(k2, s2)
                print(json.dumps(KNOWLEDGE_WORD_INNER, ensure_ascii=False))
                print(json.dumps(KNOWLEDGE_WORD, ensure_ascii=False))
            # The word counts as "inner" only if it is inner in BOTH
            # sentences; failing in either one makes it a regular word.
            # (Across calls a word may end up in both tables.)
            is_inner = (k1 > 0 and k2 > 0
                        and is_inner_word(s1[k1 - 1], word)
                        and is_inner_word(s2[k2 - 1], word))
            if is_inner:
                table, message = KNOWLEDGE_WORD_INNER, "new inner word:%s"
            else:
                table, message = KNOWLEDGE_WORD, "new word:%s"
            known_words = table.setdefault(ch, [])
            if word not in known_words:
                known_words.append(word)
                print(message % word)

        # Move on to the next occurrence of ch in s2 after the matched span;
        # the position in s1 is deliberately not advanced.
        if l2 <= k2 + count:
            return
        k2 = s2.find(ch, k2 + count)
        if k2 < 0:
            return
        

def is_inner_word(pre_ch, word):
    """Tell whether ``pre_ch + word`` is already recorded under ``pre_ch``.

    Both ``KNOWLEDGE_WORD_INNER`` and ``KNOWLEDGE_WORD`` are consulted.

    Args:
        pre_ch: The character immediately preceding ``word`` in a sentence.
        word: The candidate word.

    Returns:
        True if the concatenation ``pre_ch + word`` appears in the word list
        stored for ``pre_ch`` in either table, otherwise False.
    """
    for table in (KNOWLEDGE_WORD_INNER, KNOWLEDGE_WORD):
        if pre_ch in table and (pre_ch + word) in table[pre_ch]:
            return True
    return False

if __name__ == '__main__':
    import sys

    # Raise the recursion limit far above the default before processing.
    sys.setrecursionlimit(1000000)

    # Demo input: each sentence is learned in turn, and the sentence list and
    # both word tables are dumped after every sentence.
    scene = [
        "潘建伟教授及其同事陆朝阳、朱晓波等，联合浙江大学王浩华教授研究组，在基于光子和超导体系的量子计算机研究方面取得了系列突破性进展。",
        "2017年5月3日，该研究团队在上海正式发布了这一系列研究成果。潘建伟教授在现场宣布，在光学体系，研究团队在去年首次实现十光子纠缠操纵的基础上，利用高品质量子点单光子源构建了世界首台超越早期经典计算机的单光子量子计算机",
    ]
    separator = "------------------------\n"
    for sentence in scene:
        learn_k_base(sentence)
        print(separator, json.dumps(scene, indent=4, ensure_ascii=False))
        print(separator, json.dumps(KNOWLEDGE_WORD_INNER, ensure_ascii=False))
        print(separator, json.dumps(KNOWLEDGE_WORD, indent=4, ensure_ascii=False))
#     while(not scene or not (scene[-1] == "end")):
#         s = input("请输入：");
#         # print ("你输入的内容是：", s)
#         scene.append(s)
#         # print(scene)
#         learn_k_base(s)
#         print("------------------------\n", json.dumps(scene, indent=4, ensure_ascii=False))
#         print("------------------------\n", json.dumps(KNOWLEDGE_WORD_INNER, ensure_ascii=False))
#         print("------------------------\n", json.dumps(KNOWLEDGE_WORD, indent=4, ensure_ascii=False))

        # 天然分割























